Plasma only samples

import pandas as pd
samples = pd.read_csv('data/sample_sheet.csv')
# must be Healthy Control Study and must pass MiSeq QC
samples = samples.loc[(samples['Study'] == 'Healthy Controls') & (samples['MISEQ.QC.PASS'] == 'PASS')]  
samples = samples.set_index('MT.Unique.ID').sort_values(by=['Participant.ID', 'Source'])
# capitalize & strip whitespace for consistency
for column in ['Gender', 'Race', 'Source']:
    samples[column] = samples[column].str.capitalize()
    samples[column] = samples[column].str.strip()
# use correct ontology terms
race_ontology = {'Asian': 'Asian',
 'Black or african american': 'African_American',
 'Mixed/asian & white': 'Multiracial',
 'Mixed/asian &black': 'Multiracial',
 'Mixed/black, white, asian': 'Multiracial',
 'Native hawiian or other pacific islander': 'Pacific_Islander',
 'Pacific islander': 'Pacific_Islander',
 'White': 'White'}
for id in samples.index:
    race = samples.at[id, 'Race']
    samples.at[id, 'Race'] = race_ontology[race] if race in race_ontology else 'Multiracial'
# get only the plasma samples
mir_counts = pd.read_csv("data/get_canonical/canon_mir_counts.csv", index_col=0)
samples.index = pd.Index(['X' + str(row) for row in samples.index])
plasma_samples = samples.loc[samples['Source'] == "Plasma"]
plasma_mir_counts = mir_counts[plasma_samples.index]
plasma_samples.to_csv('data/plasma_samples.csv')
plasma_mir_counts.to_csv('data/plasma_mir_counts.csv')

Load the sample data and miRNA counts

samples <- read.csv('data/plasma_samples.csv')
counts <- read.csv('data/plasma_mir_counts.csv')
samples <- subset(samples, Library.Generation.Set != "SetRecheck" )  # exclude SetRecheck samples
samples <- samples[samples$X != "X11" & samples$X != "X92",] # remove outliers
samples$Participant.ID <- factor(samples$Participant.ID)  # ID is categorical, not numerical
rownames(counts) = counts$X
rownames(samples) = samples$X
counts$X <- NULL  # remove extra column
samples$X <- NULL
counts <- counts[,unique(rownames(samples))]
head(samples)
head(counts)

Filter

library(edgeR)
design <- model.matrix(~0+Race+Gender+Age, samples)
dge = DGEList(counts = counts, samples = samples)
# require miRNAs to have CPM > 1 in at least 2 samples
countsPerMillion <- edgeR::cpm(dge)
countCheck <- countsPerMillion > 1
head(countCheck)
                 X1    X2    X3    X4    X5    X6    X7    X8    X9   X10   X14   X15   X16   X17   X18   X19   X20   X21   X22   X23   X25
hsa-let-7a-3p  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
hsa-let-7a-5p  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7b-3p  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7b-5p  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7c-3p FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
hsa-let-7c-5p  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
                X26  X27   X28   X29   X30   X31   X32   X33   X34   X35   X36   X37   X38   X39   X40   X41  X42   X43   X44   X45   X46   X48
hsa-let-7a-3p  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7a-5p  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7b-3p  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7b-5p  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7c-3p FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
hsa-let-7c-5p  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
                X49  X50   X52   X53   X54  X55   X56   X57   X58   X59   X60   X61  X62   X63   X64   X65   X66   X67   X68   X69   X76   X77
hsa-let-7a-3p  TRUE TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7a-5p  TRUE TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7b-3p  TRUE TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7b-5p  TRUE TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7c-3p FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
hsa-let-7c-5p  TRUE TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
                X79  X80   X81   X82   X83   X84   X85  X86   X87   X88   X89   X90   X91  X93   X96   X99  X100  X102  X103  X104  X105  X106
hsa-let-7a-3p  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7a-5p  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7b-3p  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7b-5p  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hsa-let-7c-3p FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
hsa-let-7c-5p  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
keep <- which(rowSums(countCheck) >= 2) 
dge <- dge[keep,]

Explore variance

library(SingleCellExperiment)
library(scater)
reads_sce <- SingleCellExperiment(assays=list(counts=dge$counts),  colData=dge$samples)
# remove unexpressed miRNAs
keep_feature <- rowSums(counts(reads_sce) > 0) > 0
reads_sce <- reads_sce[keep_feature, ]
reads_sce <- calculateQCMetrics(reads_sce)
Note that the names of some metrics have changed, see 'Renamed metrics' in ?calculateQCMetrics.
Old names are currently maintained for back-compatibility, but may be removed in future releases.
# log transform
cpm(reads_sce) <- calculateCPM(reads_sce)
reads_sce <- normalize(reads_sce)
using library sizes as size factors
logcounts(reads_sce) <- log2(calculateCPM(reads_sce) + 1)
# visualize
hist(reads_sce$total_counts, breaks=100)  # counts per sample

hist(reads_sce$total_features, breaks=100)  # counts per miRNA

plotQC(reads_sce, type = "highest-expression")

plotQC(reads_sce, type="explanatory-variables", variables=c("Index", "Participant.ID", "Collection.Date", "Library.Generation.Set", "MiSeq.QC.Run", 'total_features', "Age", "Race", "Sex"))
variable Sex not found in colData(object).
                     Please make sure colData(object)[, variable] exists. This variable will not be plotted.

Examine sources of variation

plotPCA(reads_sce, exprs_values = "logcounts", colour_by = "Library.Generation.Set", size_by = "total_features")
non-plotting arguments like 'exprs_values' should go in 'run_args'

plotPCA(reads_sce, exprs_values = "logcounts", colour_by = "Race", shape_by="Gender", size_by = "total_features")
non-plotting arguments like 'exprs_values' should go in 'run_args'

for (var in c("total_features", "Age", "Library.Generation.Set", "Index", "Participant.ID", "Race", "Gender", "Collection.Date")) {
  print(
    plotQC(reads_sce, type = "find-pcs", exprs_values = "logcounts", variable = var)
    )  + ggtitle(var) 
  }

Normalize & Remove unwanted sources of variation

library(RUVSeq)
library(ggplot2)
library(mvoutlier)
dge <- calcNormFactors(dge)
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
res <- residuals(fit, type="deviance")
set <- newSeqExpressionSet(dge$counts, phenoData=dge$samples)
ruvr_sets <- list()
for(k in 1:5) {
  ruvr_sets[[k]] <- RUVr(set, row.names(dge), k=k, res)
  assay(reads_sce, paste("RUVr k=", toString(k))) <- log2(t(t(assayData(ruvr_sets[[k]])$normalizedCounts) / colSums(assayData(ruvr_sets[[k]])$normalizedCounts) * 1e6) + 1)
}
for(n in assayNames(reads_sce)) {
  print(
        plotPCA(
            reads_sce,
            colour_by = "Library.Generation.Set",
            size_by = "total_features",
            exprs_values = n
        ) + ggtitle(paste("Batch",n))
  )
  print(
        plotPCA(
            reads_sce,
            colour_by = "Race",
            shape_by = "Gender",
            size_by = "total_features",
            exprs_values = n
        ) + ggtitle(paste("Demographics", n))        
  )
}
non-plotting arguments like 'exprs_values' should go in 'run_args'

Detect outliers

reads_sce <- runPCA(reads_sce, use_coldata = TRUE, detect_outliers = TRUE)
failed to find 'pct_counts_feature_control' in column metadatafailed to find 'total_features_by_counts_feature_control' in column metadatafailed to find 'log10_total_counts_endogenous' in column metadatafailed to find 'log10_total_counts_feature_control' in column metadata
outliers <- colnames(reads_sce)[reads_sce$outlier]
head(outliers)
character(0)

Examine sources of variance after removing unwanted variation

for (var in c("total_features", "Age", "Library.Generation.Set", "Index", "Participant.ID", "Source", "Race", "Gender", "Collection.Date")) {
  print(
    plotQC(reads_sce, type = "find-pcs", exprs_values = "RUVr k= 2", variable = var)
    )  + ggtitle(var) 
}
Error in findImportantPCs(object, ...) : 
  variable only has one unique value, so cannot determine important
             principal components.

Visualize top highly-expressed miRNAs male-vs-female

library(RColorBrewer)
library(reshape2)
# Ryan's code, modified
norm.expr.matr <- exprs(reads_sce)
# Rank the mean expression values for plasma/serum miRs. Highest expression = 1
mean.expr.male.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Gender=="Male"]))
mean.expr.female.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Gender=="Female"]))
top_N <- 20
top.miRs <- row.names(norm.expr.matr)[mean.expr.male.rank <= top_N | mean.expr.female.rank <= top_N] # get the names of the top miRs in plasma or serum
norm.expr.top <- norm.expr.matr[top.miRs, ] # Get the expression matrix for the top mIRs
norm.expr.melt <- reshape2::melt(norm.expr.top) # Convert your normalized expression matrix to a 3 column data.frame (row.name, col.name, expression value). Melt is in the dpylr package, I believe.
colnames(norm.expr.melt) <- c("miR.ID", "MT.Unique.ID", "norm.expr") # just so it's easier for me to tell you which columns I'm using.
norm.expr.melt$Gender <- ""
for (row_num in 1:nrow(norm.expr.melt)){  # Pull the Gender values from the column metadata.
  mt_unique_id <- norm.expr.melt[row_num, ]$MT.Unique.ID
  norm.expr.melt[row_num, "Gender"] <- as.character(samples[mt_unique_id,"Gender"])
}
# This would be a simple boxplot with plasma/serum side-by-side. Overlaying the boxes over points takes a little more tweaking to get the dodge/width right, but it's doable.
ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Gender)) + geom_boxplot(pos="dodge", outlier.size=0.5) + 
  ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") + 
  theme(panel.grid.major.x = element_line(size = 0.5, linetype = 'solid', colour = "grey"), 
        panel.grid.major.y = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Gender)) + geom_boxplot(pos="dodge", outlier.size=0.5) + 
  ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") + 
  theme(panel.grid.major.y = element_line(size = 0.5, linetype = 'solid', colour = "grey"), 
        panel.grid.major.x = element_blank()) + coord_flip()

Visualize top highly-expressed miRNAs by Race

DE analysis with EdgeR

design <- model.matrix(~ 0 + Race + dge$Gender$Male + dge$Gender$Female + Age + W_1 + W_2, pData(ruvr2))
Error in model.frame.default(object, data, xlev = xlev) : 
  invalid type (NULL) for variable 'dge$Gender$Male'

Contrast gender

lrt <- glmLRT(fit, coef = "GenderMale")
results <- data.frame(topTags(lrt, n=Inf, sort.by="PValue", p.value=0.05))
sig_miRs = list()
logFC_threshold <- 1
sig_miRs[[1]] <- rownames(results[results$PValue < 0.05 & abs(results$logFC) > logFC_threshold,])  # filter by p-value and logFC
sig_miRs[[2]] <- rownames(results[results$PValue < 0.05,])
# heatmaps of significant DE miRNAs
Gender <- pData(ruvr2)[,c("Gender")]
annotations <- as.data.frame(Gender)
rownames(annotations) <- rownames(pData(ruvr2))
for(miR_list in sig_miRs) {
  pheatmap(norm.expr.matr[miR_list,], cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=annotations, main="Differentially Expressed miRNAs")
}

Contrast race

lrt <- glmLRT(fit, contrast=makeContrasts(asn=(Asian-(African_American+White)/2),
                                          afr=(African_American-(Asian+White)/2),
                                          wht=(White-(African_American+Asian)/2)), levels = design)
results <- data.frame(topTags(lrt, n=Inf, sort.by="PValue", p.value=0.05))
sig_miRs = list()
logFC_threshold <- 1
sig_miRs[[1]] <- rownames(results[results$PValue < 0.05 & abs(results$logFC) > logFC_threshold,])  # filter by p-value and logFC
sig_miRs[[2]] <- rownames(results[results$PValue < 0.05,])
# heatmaps of significant DE miRNAs
Race <- pData(ruvr2)[,c("Race", "Gender")]
annotations <- as.data.frame(Race)
rownames(annotations) <- rownames(pData(ruvr2))
for(miR_list in sig_miRs) {
  pheatmap(norm.expr.matr[miR_list,], cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=annotations, main="Differentially Expressed miRNAs")
}

Create a PCA plot showing Age x Gender

plotPCA(
            reads_sce,
            colour_by = "Age",
            shape_by = "Gender",
            size_by = "total_features",
            exprs_values = "RUVr k= 2"
) + ggtitle("RUVSeq-Normalized Expression (k=2)") 
non-plotting arguments like 'exprs_values' should go in 'run_args'

save.image("diff_expr_plasma_only.RData")
---
title: "miRNA Differential expression analysis for plasma samples"
output: html_notebook
editor_options: 
  chunk_output_type: inline
---
# Plasma only samples
```{python3}
import pandas as pd
samples = pd.read_csv('data/sample_sheet.csv')
# must be Healthy Control Study and must pass MiSeq QC
samples = samples.loc[(samples['Study'] == 'Healthy Controls') & (samples['MISEQ.QC.PASS'] == 'PASS')]  
samples = samples.set_index('MT.Unique.ID').sort_values(by=['Participant.ID', 'Source'])
# capitalize & strip whitespace for consistency
for column in ['Gender', 'Race', 'Source']:
    samples[column] = samples[column].str.capitalize()
    samples[column] = samples[column].str.strip()
# use correct ontology terms
race_ontology = {'Asian': 'Asian',
 'Black or african american': 'African_American',
 'Mixed/asian & white': 'Multiracial',
 'Mixed/asian &black': 'Multiracial',
 'Mixed/black, white, asian': 'Multiracial',
 'Native hawiian or other pacific islander': 'Pacific_Islander',
 'Pacific islander': 'Pacific_Islander',
 'White': 'White'}
for id in samples.index:
    race = samples.at[id, 'Race']
    samples.at[id, 'Race'] = race_ontology[race] if race in race_ontology else 'Multiracial'
# get only the plasma samples
mir_counts = pd.read_csv("data/get_canonical/canon_mir_counts.csv", index_col=0)
samples.index = pd.Index(['X' + str(row) for row in samples.index])
plasma_samples = samples.loc[samples['Source'] == "Plasma"]
plasma_mir_counts = mir_counts[plasma_samples.index]
plasma_samples.to_csv('data/plasma_samples.csv')
plasma_mir_counts.to_csv('data/plasma_mir_counts.csv')
```

## Load the sample data and miRNA counts
```{r}
samples <- read.csv('data/plasma_samples.csv')
counts <- read.csv('data/plasma_mir_counts.csv')
samples <- subset(samples, Library.Generation.Set != "SetRecheck" )  # exclude SetRecheck samples
samples <- samples[samples$X != "X11" & samples$X != "X92",] # remove outliers
samples$Participant.ID <- factor(samples$Participant.ID)  # ID is categorical, not numerical
rownames(counts) = counts$X
rownames(samples) = samples$X
counts$X <- NULL  # remove extra column
samples$X <- NULL
counts <- counts[,unique(rownames(samples))]
head(samples)
head(counts)
```

## Filter
```{r}
library(edgeR)
design <- model.matrix(~0+Race+Gender+Age, samples)
dge = DGEList(counts = counts, samples = samples)
# require miRNAs to have CPM > 1 in at least 2 samples
countsPerMillion <- edgeR::cpm(dge)
countCheck <- countsPerMillion > 1
head(countCheck)
keep <- which(rowSums(countCheck) >= 2) 
dge <- dge[keep,]
```
## Explore variance
```{r}
library(SingleCellExperiment)
library(scater)
reads_sce <- SingleCellExperiment(assays=list(counts=dge$counts),  colData=dge$samples)
# remove unexpressed miRNAs
keep_feature <- rowSums(counts(reads_sce) > 0) > 0
reads_sce <- reads_sce[keep_feature, ]
reads_sce <- calculateQCMetrics(reads_sce)
# log transform
cpm(reads_sce) <- calculateCPM(reads_sce)
reads_sce <- normalize(reads_sce)
logcounts(reads_sce) <- log2(calculateCPM(reads_sce) + 1)
# visualize
hist(reads_sce$total_counts, breaks=100)  # counts per sample
hist(reads_sce$total_features, breaks=100)  # counts per miRNA
plotQC(reads_sce, type = "highest-expression")
plotQC(reads_sce, type="explanatory-variables", variables=c("Index", "Participant.ID", "Collection.Date", "Library.Generation.Set", "MiSeq.QC.Run", 'total_features', "Age", "Race", "Sex"))
```
## Examine sources of variation
```{r}
plotPCA(reads_sce, exprs_values = "logcounts", colour_by = "Library.Generation.Set", size_by = "total_features")
plotPCA(reads_sce, exprs_values = "logcounts", colour_by = "Race", shape_by="Gender", size_by = "total_features")
for (var in c("total_features", "Age", "Library.Generation.Set", "Index", "Participant.ID", "Race", "Gender", "Collection.Date")) {
  print(
    plotQC(reads_sce, type = "find-pcs", exprs_values = "logcounts", variable = var)
    )  + ggtitle(var) 
  }
```

## Normalize & Remove unwanted sources of variation
```{r}
library(RUVSeq)
library(ggplot2)
library(mvoutlier)
dge <- calcNormFactors(dge)
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
res <- residuals(fit, type="deviance")
set <- newSeqExpressionSet(dge$counts, phenoData=dge$samples)
ruvr_sets <- list()
for(k in 1:5) {
  ruvr_sets[[k]] <- RUVr(set, row.names(dge), k=k, res)
  assay(reads_sce, paste("RUVr k=", toString(k))) <- log2(t(t(assayData(ruvr_sets[[k]])$normalizedCounts) / colSums(assayData(ruvr_sets[[k]])$normalizedCounts) * 1e6) + 1)
}
for(n in assayNames(reads_sce)) {
  print(
        plotPCA(
            reads_sce,
            colour_by = "Library.Generation.Set",
            size_by = "total_features",
            exprs_values = n
        ) + ggtitle(paste("Batch",n))
  )
  print(
        plotPCA(
            reads_sce,
            colour_by = "Race",
            shape_by = "Gender",
            size_by = "total_features",
            exprs_values = n
        ) + ggtitle(paste("Demographics", n))        
  )
}
```

## Detect outliers
```{r}
reads_sce <- runPCA(reads_sce, use_coldata = TRUE, detect_outliers = TRUE)
outliers <- colnames(reads_sce)[reads_sce$outlier]
head(outliers)
```

## Examine sources of variance after removing unwanted variation
```{r}
for (var in c("total_features", "Age", "Library.Generation.Set", "Index", "Participant.ID", "Source", "Race", "Gender", "Collection.Date")) {
  print(
    plotQC(reads_sce, type = "find-pcs", exprs_values = "RUVr k= 2", variable = var)
    )  + ggtitle(var) 
}
```
## Visualize top highly-expressed miRNAs male-vs-female
```{r}
library(RColorBrewer)
library(reshape2)
# Ryan's code, modified
norm.expr.matr <- exprs(reads_sce)
# Rank the mean expression values for plasma/serum miRs. Highest expression = 1
mean.expr.male.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Gender=="Male"]))
mean.expr.female.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Gender=="Female"]))
top_N <- 20
top.miRs <- row.names(norm.expr.matr)[mean.expr.male.rank <= top_N | mean.expr.female.rank <= top_N] # get the names of the top miRs in plasma or serum
norm.expr.top <- norm.expr.matr[top.miRs, ] # Get the expression matrix for the top mIRs
norm.expr.melt <- reshape2::melt(norm.expr.top) # Convert your normalized expression matrix to a 3 column data.frame (row.name, col.name, expression value). Melt is in the dpylr package, I believe.
colnames(norm.expr.melt) <- c("miR.ID", "MT.Unique.ID", "norm.expr") # just so it's easier for me to tell you which columns I'm using.
norm.expr.melt$Gender <- ""
for (row_num in 1:nrow(norm.expr.melt)){  # Pull the Gender values from the column metadata.
  mt_unique_id <- norm.expr.melt[row_num, ]$MT.Unique.ID
  norm.expr.melt[row_num, "Gender"] <- as.character(samples[mt_unique_id,"Gender"])
}
# This would be a simple boxplot with plasma/serum side-by-side. Overlaying the boxes over points takes a little more tweaking to get the dodge/width right, but it's doable.
ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Gender)) + geom_boxplot(pos="dodge", outlier.size=0.5) + 
  ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") + 
  theme(panel.grid.major.x = element_line(size = 0.5, linetype = 'solid', colour = "grey"), 
        panel.grid.major.y = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Gender)) + geom_boxplot(pos="dodge", outlier.size=0.5) + 
  ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") + 
  theme(panel.grid.major.y = element_line(size = 0.5, linetype = 'solid', colour = "grey"), 
        panel.grid.major.x = element_blank()) + coord_flip()
```
## Visualize top highly-expressed miRNAs by Race
```{r}
library(RColorBrewer)
library(reshape2)
# Ryan's code, modified
norm.expr.matr <- exprs(reads_sce)
# Rank the mean expression values for plasma/serum miRs. Highest expression = 1
mean.expr.afr.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Race=="African_American"]))
mean.expr.asn.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Race=="Asian"]))
mean.expr.wht.rank <- rank(-1*rowMeans(norm.expr.matr[, colData(reads_sce)$Race=="White"]))
top_N <- 20
top.miRs <- row.names(norm.expr.matr)[mean.expr.afr.rank <= top_N | mean.expr.asn.rank <= top_N | mean.expr.wht.rank <= top_N] # get the names of the top miRs in plasma or serum
norm.expr.top <- norm.expr.matr[top.miRs, ] # Get the expression matrix for the top mIRs
norm.expr.melt <- reshape2::melt(norm.expr.top) # Convert your normalized expression matrix to a 3 column data.frame (row.name, col.name, expression value). Melt is in the dpylr package, I believe.
colnames(norm.expr.melt) <- c("miR.ID", "MT.Unique.ID", "norm.expr") # just so it's easier for me to tell you which columns I'm using.
norm.expr.melt$Race <- ""
for (row_num in 1:nrow(norm.expr.melt)){  # Pull the Gender values from the column metadata.
  mt_unique_id <- norm.expr.melt[row_num, ]$MT.Unique.ID
  norm.expr.melt[row_num, "Race"] <- as.character(samples[mt_unique_id,"Race"])
}
norm.expr.melt <- norm.expr.melt[norm.expr.melt$Race %in% c("African_American", "Asian", "White"),]
# This would be a simple boxplot with plasma/serum side-by-side. Overlaying the boxes over points takes a little more tweaking to get the dodge/width right, but it's doable.
ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Race)) + geom_boxplot(pos="dodge", outlier.size=0.5) + 
  ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") + 
  theme(panel.grid.major.x = element_line(size = 0.5, linetype = 'solid', colour = "grey"), 
        panel.grid.major.y = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(norm.expr.melt, aes(x=reorder(miR.ID, norm.expr, FUN=median), y=norm.expr, fill=Race)) + geom_boxplot(pos="dodge", outlier.size=0.5) + 
  ggtitle("Top 20 Expressed miRNAs") + ylab("Normalized Expression") + xlab("miRNA ID") + 
  theme(panel.grid.major.y = element_line(size = 0.5, linetype = 'solid', colour = "grey"), 
        panel.grid.major.x = element_blank()) + coord_flip()
```

# DE analysis with EdgeR
```{r}
library(pheatmap)
# based on Ryan's code, pass RUVSeq-corrected data to edgeR
ruvr2 <- ruvr_sets[[2]]
design <- model.matrix(~ 0 + Race + Gender + Age + W_1 + W_2, pData(ruvr2))
dge <- estimateDisp(dge, design = design, tagwise = TRUE, robust = TRUE)
fit <- glmFit(dge, design)
```

## Contrast gender
```{r}
lrt <- glmLRT(fit, coef = "GenderMale")
results <- data.frame(topTags(lrt, n=Inf, sort.by="PValue", p.value=0.05))
sig_miRs = list()
logFC_threshold <- 1
sig_miRs[[1]] <- rownames(results[results$PValue < 0.05 & abs(results$logFC) > logFC_threshold,])  # filter by p-value and logFC
sig_miRs[[2]] <- rownames(results[results$PValue < 0.05,])
# heatmaps of significant DE miRNAs
Gender <- pData(ruvr2)[,c("Gender")]
annotations <- as.data.frame(Gender)
rownames(annotations) <- rownames(pData(ruvr2))
for(miR_list in sig_miRs) {
  pheatmap(norm.expr.matr[miR_list,], cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=annotations, main="Differentially Expressed miRNAs")
}
```

## Contrast race
```{r}
lrt <- glmLRT(fit, contrast=makeContrasts(asn=(Asian-(African_American+White)/2),
                                          afr=(African_American-(Asian+White)/2),
                                          wht=(White-(African_American+Asian)/2)), levels = design)
results <- data.frame(topTags(lrt, n=Inf, sort.by="PValue", p.value=0.05))
sig_miRs = list()
logFC_threshold <- 1
sig_miRs[[1]] <- rownames(results[results$PValue < 0.05 & abs(results$logFC) > logFC_threshold,])  # filter by p-value and logFC
sig_miRs[[2]] <- rownames(results[results$PValue < 0.05,])
# heatmaps of significant DE miRNAs
Race <- pData(ruvr2)[,c("Race", "Gender")]
annotations <- as.data.frame(Race)
rownames(annotations) <- rownames(pData(ruvr2))
for(miR_list in sig_miRs) {
  pheatmap(norm.expr.matr[miR_list,], cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, annotation_col=annotations, main="Differentially Expressed miRNAs")
}
```

## Create a PCA plot showing Age x Gender
```{r}
plotPCA(
            reads_sce,
            colour_by = "Age",
            shape_by = "Gender",
            size_by = "total_features",
            exprs_values = "RUVr k= 2"
) + ggtitle("RUVSeq-Normalized Expression (k=2)") 
```

```{r}
save.image("diff_expr_plasma_only.RData")
```